"""
Preprocess the data into the required format and write it to CSV files for later use.

Author: Meng Zhang
Date: January 2024

Input: Data/sessionsdata_anonym.csv
Output:
    all_states.csv
    RL_trasition_samples_with_empty_next_states.csv
    RL_trasition_weighted_reward.csv
    RL_trasition_samples.csv
"""

import pandas as pd
import numpy as np
import Utils as util

# Path of the raw session responses (one row per recorded response).
FILE_PATH_TO_SESSIONS_DATA = 'Data/sessionsdata_anonym.csv'
# Number of leading state-feature columns that get binarised further below.
NUM_STATE_FEATURES = 11

# Load the long-format responses.
dfs = pd.read_csv(FILE_PATH_TO_SESSIONS_DATA)

# Reshape to one row per (rand_id, session_num) and one column per
# response_type. If a response type occurs more than once for the same
# session, the first recorded value is kept.
new_dfs = dfs.pivot_table(
    index=['rand_id', 'session_num'],
    columns='response_type',
    values='response_value',
    aggfunc=lambda x: x.iloc[0],
)

# Remove the response types that are not used in the transition samples.
new_dfs = new_dfs.drop(
    columns=[
        'activity_experience_mod_slot',
        'activity_experience_slot',
        'activity_new_index',
        'mood',
    ]
)

# NOTE: all_states.csv is intentionally not written here. The file is written
# (with the rand_id/session_num index) once the Binary_State column has been
# added; an earlier index-less dump would only be overwritten by that write.

# Transform the state features into binary form: a feature becomes 1 when its
# value is greater than or equal to that feature's mean over all rows, else 0.
just_states = new_dfs.drop(columns=['cluster_new_index', 'effort', 'dropout_response'])
just_states = just_states.astype(float)

# Per-feature mean, used as the binarisation threshold.
feat_means = just_states.mean()

# Fail loudly instead of silently producing shorter state vectors.
if just_states.shape[1] < NUM_STATE_FEATURES:
    raise ValueError(
        f"Expected at least {NUM_STATE_FEATURES} state feature columns, "
        f"found {just_states.shape[1]}"
    )

# Select both operands positionally with .iloc so the first
# NUM_STATE_FEATURES columns are compared against their own means; plain
# integer keys on a label-indexed Series are a deprecated positional fallback.
state_values = just_states.iloc[:, :NUM_STATE_FEATURES]
thresholds = feat_means.iloc[:NUM_STATE_FEATURES]

# Missing values compare as False and therefore map to 0.
binary_states = (
    state_values.ge(thresholds, axis='columns').astype(int).values.tolist()
)

# Store each row's binary vector as a single list-valued cell.
just_states["Binary_State"] = binary_states
new_dfs["Binary_State"] = binary_states

# Written with the (rand_id, session_num) index so both become leading columns.
new_dfs.to_csv("all_states.csv")



# For training:
# Samples need to be in the form (s_0, s_1, a, r).
# State features need to be in binary form.
# [binary_states_feature_for_current_state, binary_states_feature_for_next_state,
# cluster_new_index_of_current_state, effort_current_state, dropout_response_current_state,
# rand_id, start_session, end_session]
all_states_df = pd.read_csv("all_states.csv")

# Fill missing values with the rounded column mean.
cols_to_fill = ['effort', 'dropout_response',
                'state_1', 'state_2', 'state_3', 'state_4',
                'state_5', 'state_6', 'state_7', 'state_8',
                'state_9', 'state_busy', 'state_energy']
all_states_df[cols_to_fill] = all_states_df[cols_to_fill].fillna(
    np.round(all_states_df[cols_to_fill].mean())
)

# Columns are addressed by name rather than by position so that a change in
# the set of response-type columns cannot silently shift which column is read.
rand_ids = all_states_df['rand_id'].tolist()
binary_state_cells = all_states_df['Binary_State'].tolist()

# If the following row has the same rand_id it is the same person, so that
# row's binary state is this row's next state; otherwise the next state is
# left as an empty list.
next_states_features = [
    following_state if current_id == following_id else []
    for current_id, following_id, following_state
    in zip(rand_ids, rand_ids[1:], binary_state_cells[1:])
]
# The final row has no successor at all.
next_states_features.append([])

all_states_df["Binary_State_Next_Session"] = next_states_features
all_states_df = all_states_df[['rand_id', 'session_num', 'cluster_new_index',
                               'dropout_response', 'effort', 'Binary_State', 'Binary_State_Next_Session']]

# Drop every row that is not followed by another row of the same rand_id
# (i.e. each person's last recorded session), since it has no next-session data.
participant_ids = all_states_df['rand_id'].tolist()
has_next_session = [
    current_id == following_id
    for current_id, following_id in zip(participant_ids, participant_ids[1:])
]
# The last row can never have a successor; pad the mask to the frame length.
has_next_session += [False] * (len(participant_ids) - len(has_next_session))

# Filter in one pass instead of appending row by row (which is quadratic and
# turns every column into object dtype), and emit the columns in output order.
drop_empty_next_state = all_states_df.loc[
    has_next_session,
    ['Binary_State', 'Binary_State_Next_Session',
     'cluster_new_index', 'effort', 'dropout_response', 'rand_id', 'session_num'],
].reset_index(drop=True)

# Line up the cluster_new_index values in both frames for later use
# (the exact mapping is defined in Utils.line_up_with_index).
drop_empty_next_state = util.line_up_with_index(drop_empty_next_state)
all_states_df = util.line_up_with_index(all_states_df)

# Write the transition samples: first the full set (including rows whose next
# state is empty), then the set restricted to rows that have a next state.
for samples, csv_name in (
    (all_states_df, "RL_trasition_samples_with_empty_next_states.csv"),
    (drop_empty_next_state, "RL_trasition_samples.csv"),
):
    samples.to_csv(csv_name, index=False)

# Compute the weighted-sum reward for the transitions and write it to CSV.
# NOTE(review): the meaning of the 0.5 argument and the input this helper
# reads are defined in Utils - confirm there before changing the order above.
weighted_result = util.weighted_sum_of_reward__for_transitions(0.5)
transitions_df, weighted_mean, weighted_min, weighted_max = weighted_result
transitions_df.to_csv("RL_trasition_weighted_reward.csv", index=False)






